Visualize predictions

In [1]:
from IPython.display import HTML

# Render a button that toggles visibility of all code cells.
# NOTE(review): relies on the classic notebook's jQuery ('$') being available
# in the page — this will not work in JupyterLab.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[1]:
In [1]:
import numpy as np
import sklearn.metrics as metrics
from inspect import signature
import seaborn as sns
import shap
from sklearn import preprocessing

import matplotlib.pyplot as plt
shap.initjs()

import pandas as pd

figuresize = (8,6)

%matplotlib inline
import os
In [4]:
# Load the cached predictions, labels, thresholds and evaluation metrics
# produced by the training run.
all_testy_hat = np.load('all_testy_hat.npy')
all_testys = np.load('all_testys.npy')
all_trainy_hat = np.load('all_trainy_hat.npy')
all_trainys = np.load('all_trainys.npy')
all_validy_hat = np.load('all_validy_hat.npy')
all_validys = np.load('all_validys.npy')
cutpoint_avg = np.load('cutpoint_avg.npy')
test_evalmetrics = np.load('test_evalmetrics.npy')
threshold_avg = np.load('threshold_avg.npy')
train_evalmetrics = np.load('train_evalmetrics.npy')
valid_evalmetrics = np.load('valid_evalmetrics.npy')

# Decile cut points (0th..100th percentile) of the predicted scores,
# computed separately for each ground-truth class.
DECILES = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

all_trainy_hat_1 = all_trainy_hat[[y == 1.0 for y in all_trainys]]
all_trainy_hat_0 = all_trainy_hat[[y == 0.0 for y in all_trainys]]
all_train_percentile_1 = np.percentile(all_trainy_hat_1, DECILES)
all_train_percentile_0 = np.percentile(all_trainy_hat_0, DECILES)

all_testy_hat_1 = all_testy_hat[[y == 1.0 for y in all_testys]]
all_testy_hat_0 = all_testy_hat[[y == 0.0 for y in all_testys]]
all_test_percentile_1 = np.percentile(all_testy_hat_1, DECILES)
all_test_percentile_0 = np.percentile(all_testy_hat_0, DECILES)
In [5]:
y_true = all_testys       # ground-truth test labels
y_probas = all_testy_hat  # model scores on the test set

fpr, tpr, thresholds = metrics.roc_curve(y_true, y_probas, pos_label=1)
# AUC as the trapezoidal integral of TPR over FPR.
auc = np.trapz(tpr, x=fpr)

# Plot the ROC curve using the explicit fig/ax interface.
fig, ax = plt.subplots(figsize=figuresize)
ax.plot(fpr, tpr)
ax.set_title('ROC curve with AUC = {}'.format(auc))
plt.show()

print('AUC:', auc)
AUC: 0.9924205848561507
In [6]:
precision, recall, _ = metrics.precision_recall_curve(y_true, y_probas)
average_precision = metrics.average_precision_score(y_true, y_probas)

plt.figure(figsize=figuresize)

# Older matplotlib releases do not accept the 'step' keyword on fill_between,
# so only pass it when the installed version supports it.
if 'step' in signature(plt.fill_between).parameters:
    step_kwargs = {'step': 'post'}
else:
    step_kwargs = {}

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
Out[6]:
Text(0.5, 1.0, '2-class Precision-Recall curve: AP=0.67')
  • Blue is for the model's prediction of groundtruth 'Not clicked'
  • Red is for the model's prediction of groundtruth 'Clicked'

SHAP values

Dataset

In [9]:
import tensorflow as tf
In [10]:
import logging
#logger = logging.getLogger()
#logger.setLevel(logging.DEBUG)
#logging.debug("test")
In [11]:
import training
from config import args, model_hps, train_hps, input_pipeline_hps, dir_hps #, Colnames, REDIS_PWD, REDIS_HOSTS_SETS
from utils import output2csv #, transfer2redis
from config import IAB_NUM, LABEL_NUM_1,CLIENT_LABEL_NUM, ADCATEGORIES_NUM, INT_FEATURES_LIST, FLOAT_FEATURES_LIST
In [12]:
model_hps
Out[12]:
HParams([('Lambda', 0.2749399860397595), ('decay_rate', 0.9636366606929614), ('decay_step', 100), ('drop_rate', 0.47133755091837504), ('embedding_units', [3, 3, 3, 100, 20, 50, 23]), ('embedding_units_ohe', [10, 10]), ('gamma', 0.7428210911379075), ('hidden_units', [128, 64]), ('learning_rate', 0.008406203935304156), ('loss', 40)])
In [13]:
train_hps
Out[13]:
HParams([('earlystop_check_frequency', 10), ('earlystop_duration', 10), ('has_gpu', 0), ('is_percentile_threshold', False), ('is_test', 1), ('model', 'DNN'), ('num_threshold_buffer', 3), ('num_training', 11009), ('num_training_min', 10900), ('percentile_threshold', 8), ('print_train_iter', 545), ('save_model', 545), ('test_length', 300), ('valid_loss_delta', 0.01), ('validation_frequency', 545), ('validation_length', 1)])
In [15]:
dir_hps
Out[15]:
HParams([('builder_save_dir', 'builder_save'), ('load_dir', 'DNN_2019-06-14T03:59:22'), ('result_dir', './Outputs/result.csv'), ('save_dir', './Outputs/'), ('store_dir', 'DNN_2019-06-14T10:06:28')])
In [16]:
input_pipeline_hps.batch_size = 256  # batch size used for every batch/reshape below
In [17]:
# Count the total number of records across all TFRecord files.
# This is a full pass over the data, so it can be slow for large files.
DATASET_SIZE = sum(
    1
    for fn in input_pipeline_hps.data_file
    for _ in tf.python_io.tf_record_iterator(fn)
)
logging.info('TOTAL DATASET_SIZE = %d', DATASET_SIZE)
In [18]:
DATASET_SIZE
Out[18]:
19949
In [19]:
def extract_tfrecords(data_record):
    """Parse one serialized tf.Example into a (features_dict, label) pair.

    Fixed-length int/float features come from the configured feature lists;
    'var1'/'var2' are variable-length id lists that are densified and summed
    into fixed-width multi-hot vectors.
    """
    # NOTE(review): LABEL_NUM_2/3/4 are not in the visible imports (only
    # LABEL_NUM_1 is) — confirm they are defined elsewhere in the notebook.
    features =  {}  
    for int_feature in INT_FEATURES_LIST:
        features[int_feature] = tf.FixedLenFeature([1], tf.int64)
    for float_feature in FLOAT_FEATURES_LIST:
        features[float_feature] = tf.FixedLenFeature([1], tf.float32)

    features['var1'] = tf.VarLenFeature(tf.int64)
    features['var2'] = tf.VarLenFeature(tf.int64)
    features['var3'] = tf.FixedLenFeature([1, LABEL_NUM_1], tf.int64)
    features['var4'] = tf.FixedLenFeature([1, LABEL_NUM_2], tf.int64)

    sample = tf.parse_single_example(data_record, features)
    # NOTE(review): the result of this reshape is discarded, so the statement
    # has no effect — either assign it back to sample['var3'] or delete it.
    tf.reshape(sample['var3'],[tf.shape(sample['var3'])[0],-1])

    # Densify the variable-length id lists, then one-hot + sum = multi-hot.
    sample["var1"] = tf.sparse_tensor_to_dense(sample["var1"], default_value=0)
    sample["var1"] = tf.reduce_sum(tf.one_hot(sample["var1"], depth=LABEL_NUM_3),axis=0)
    sample["var2"] = tf.sparse_tensor_to_dense(sample["var2"], default_value=0)
    sample["var2"] = tf.reduce_sum(tf.one_hot(sample["var2"], depth=LABEL_NUM_4),axis=0)
    sample["var5"] = tf.cast(sample["var5"], tf.int32)

    y = sample["label"]

    return (sample, y)
In [20]:
# Build the train/valid(/test) splits from the TFRecord files.
full_dataset = tf.data.TFRecordDataset(input_pipeline_hps.data_file)

if input_pipeline_hps.is_test:
    test_ratio = 1.0 - input_pipeline_hps.train_ratio - input_pipeline_hps.valid_ratio
    train_size = int(input_pipeline_hps.train_ratio * DATASET_SIZE)
    valid_size = int(input_pipeline_hps.valid_ratio * DATASET_SIZE)
    test_size  = int(test_ratio * DATASET_SIZE)
    logging.info('train_size: %d valid_size: %d test_size: %d', train_size, valid_size, test_size)

    # BUG FIX: shuffle once and keep the order fixed. With the default
    # reshuffle_each_iteration=True the data is reshuffled *before* the
    # take/skip splits on every epoch, so train/valid/test would draw
    # different, overlapping records each pass (test data leaking into train).
    full_dataset = full_dataset.shuffle(buffer_size=DATASET_SIZE, reshuffle_each_iteration=False)
    full_dataset = full_dataset.map(extract_tfrecords,
                        num_parallel_calls=input_pipeline_hps.num_cores)
    train_dataset = full_dataset.take(train_size)
    remaining = full_dataset.skip(train_size)
    # BUG FIX: the original assigned valid = remaining.skip(valid_size) and
    # test = remaining.take(test_size), which overlaps the two splits whenever
    # valid_size != test_size (and swaps their roles otherwise). take/skip at
    # the same offset makes them disjoint.
    valid_dataset = remaining.take(valid_size)
    test_dataset = remaining.skip(valid_size)

else:
    train_size = int(input_pipeline_hps.train_ratio * DATASET_SIZE)
    valid_size = DATASET_SIZE - train_size
    logging.info('train_size: %d valid_size: %d', train_size, valid_size)

    full_dataset = full_dataset.shuffle(buffer_size=DATASET_SIZE, reshuffle_each_iteration=False)
    full_dataset = full_dataset.map(extract_tfrecords, num_parallel_calls=input_pipeline_hps.num_cores)
    train_dataset = full_dataset.take(train_size)
    valid_dataset = full_dataset.skip(train_size)
    # NOTE(review): test_dataset is only defined in the is_test branch; the
    # cells below assume is_test is truthy (train_hps shows is_test == 1).

sess = tf.InteractiveSession()

# Training input pipeline: repeat forever, batch, prefetch.
train_dataset = train_dataset.repeat()
train_dataset = train_dataset.batch(input_pipeline_hps.batch_size)
train_dataset = train_dataset.prefetch(input_pipeline_hps.prefetch_size)

train_iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
train_init_op = train_iterator.make_initializer(train_dataset)
train_next_batch = train_iterator.get_next()

sess.run(train_init_op)

# Test input pipeline mirrors the training one.
test_dataset = test_dataset.repeat()
test_dataset = test_dataset.batch(input_pipeline_hps.batch_size)
test_dataset = test_dataset.prefetch(input_pipeline_hps.prefetch_size)

test_iterator = tf.data.Iterator.from_structure(test_dataset.output_types, test_dataset.output_shapes)
test_init_op = test_iterator.make_initializer(test_dataset)
test_next_batch = test_iterator.get_next()

sess.run(test_init_op)

# Materialize one batch of each for the SHAP workflow below.
train_data = sess.run(train_next_batch)
test_data = sess.run(test_next_batch)
WARNING:tensorflow:From /home/rosalie/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
WARNING:tensorflow:From /home/rosalie/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/sparse_ops.py:1165: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.
In [22]:
np.shape(train_data[0]['lab1']), np.shape(train_data[0]['lab2']), np.shape(train_data[0]['lab3'])
Out[22]:
((256, 1, 253), (256, 393), (256, 3181))
In [24]:
features = #not shown
len(features)
Out[24]:
14

Helper functions

In [25]:
import numpy as np
def preprocessor(train_data_dict):
    """Flatten one batch dict into a 2-D feature matrix for SHAP.

    Parameters
    ----------
    train_data_dict : tuple
        (features_dict, labels) as yielded by the dataset iterator;
        only element 0 is used here.

    Returns
    -------
    np.ndarray
        Shape (batch, len(features) + LABEL_NUM_1 + LABEL_NUM_2 + LABEL_NUM_3):
        the scalar features in `features` order, then the three multi-hot blocks.
    """
    batch = train_data_dict[0]

    # Collect every column first and stack once: the original called
    # np.hstack inside the loop, copying the growing matrix on each
    # iteration (accidentally quadratic).
    blocks = [batch[feature] for feature in features]

    # reshape(-1, k) instead of a hard-coded batch_size, so a short final
    # batch does not raise on the reshape.
    blocks.append(np.reshape(batch['lab1'], (-1, LABEL_NUM_1)))
    blocks.append(np.reshape(batch['lab2'], (-1, LABEL_NUM_2)))
    blocks.append(np.reshape(batch['lab3'], (-1, LABEL_NUM_3)))

    return np.hstack(blocks)
In [26]:
def data_to_df(data):
    """Wrap a flat feature matrix in a DataFrame with readable column names.

    Columns are the scalar feature names followed by one indexed column per
    element of the lab1/lab2/lab3 multi-hot blocks, matching preprocessor().
    """
    label_cols = (
        ['lab1_' + str(i) for i in range(LABEL_NUM_1)]
        + ['lab2_' + str(i) for i in range(LABEL_NUM_2)]
        + ['lab3_' + str(i) for i in range(LABEL_NUM_3)]
    )
    return pd.DataFrame(data, columns=features + label_cols)
In [27]:
class DNNPredictor(object):
    """Expose a restored DNN checkpoint as a plain ``predict(ndarray)`` callable.

    shap.KernelExplainer only needs a function mapping a 2-D feature matrix to
    prediction probabilities; this class hides the TF session / feed-dict
    plumbing behind that interface.
    """

    def __init__(self, model_hps, input_pipeline_hps, dir_hps):
        tf.reset_default_graph()
        self.model_hps = model_hps
        self.input_pipeline_hps = input_pipeline_hps
        self.dir_hps = dir_hps
        self.sess = tf.Session()

        self.build_model()

    def build_model(self):
        """Build the DNN graph and restore the latest checkpoint, if any."""
        from model.dnn import DNN
        self._model = DNN(self.input_pipeline_hps, self.model_hps)
        self._model.core_builder()
        self.sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(max_to_keep=10)

        # BUG FIX: the original tested the module-level `dir_hps` instead of
        # self.dir_hps, and left `model_file` unbound (NameError on the next
        # line) whenever save_dir was falsy.
        model_file = None
        if self.dir_hps.save_dir:
            model_file = tf.train.latest_checkpoint(self.dir_hps.save_dir + self.dir_hps.load_dir)
        if model_file:
            logging.info('Restoring from: %s', model_file)
            saver.restore(self.sess, model_file)

    def predict(self, instances):
        """Run the model on `instances` and return its y_prob output.

        Parameters
        ----------
        instances : np.ndarray, shape (num_rows, n_features)
            Column layout must match preprocessor(): scalar features first
            (columns 0..13), then the lab1/lab2/lab3 multi-hot blocks.
        """
        num_rows = len(instances)

        # NOTE(review): the original dict literal repeated the keys for
        # 'var3' (columns 2 and 3) and 'var5' (columns 5 and 6). Python keeps
        # only the LAST entry per duplicated key, so columns 2 and 5 were
        # silently dropped; the entry for column 4 was also syntactically
        # broken ("1LABEL_NUM_1"). The duplicates look like redaction
        # artifacts — restore the real placeholder names so that every
        # column is actually fed to the model.
        model_feed_dict = {
                self._model.phase: 0,  # 0 = inference mode

                self._model.features_dict['var1'][2]: instances[:, 0].reshape(num_rows, 1),
                self._model.features_dict['var2'][2]: instances[:, 1].reshape(num_rows, 1),
                self._model.features_dict['var3'][2]: instances[:, 3].reshape(num_rows, 1),
                self._model.features_dict['var4'][2]: instances[:, 4].reshape(num_rows, 1),
                self._model.features_dict['var5'][2]: instances[:, 6].reshape(num_rows, 1),

                self._model.var6: instances[:, 7].reshape(num_rows, 1),
                self._model.var7: instances[:, 8].reshape(num_rows, 1),
                self._model.var8: instances[:, 9].reshape(num_rows, 1),
                self._model.var9: instances[:, 10].reshape(num_rows, 1),
                self._model.var10: instances[:, 11].reshape(num_rows, 1),
                self._model.var11: instances[:, 12].reshape(num_rows, 1),
                self._model.var12: instances[:, 13].reshape(num_rows, 1),

                self._model.lab1: instances[:, 14:14 + LABEL_NUM_1].reshape(num_rows, LABEL_NUM_1),
                self._model.lab2: instances[:, 14 + LABEL_NUM_1:14 + LABEL_NUM_1 + LABEL_NUM_2].reshape(num_rows, LABEL_NUM_2),
                self._model.lab3: instances[:, 14 + LABEL_NUM_1 + LABEL_NUM_2:14 + LABEL_NUM_1 + LABEL_NUM_2 + LABEL_NUM_3].reshape(num_rows, LABEL_NUM_3),
            }

        outputs = self.sess.run(self._model.y_prob, feed_dict=model_feed_dict)
        return outputs

SHAP explainer

In [28]:
dnnobj = DNNPredictor(model_hps,input_pipeline_hps,dir_hps)
INFO:tensorflow:Restoring parameters from ./Outputs/DNN_2019-06-14T03:59:22/model
INFO:tensorflow:Restoring parameters from ./Outputs/DNN_2019-06-14T03:59:22/model
In [29]:
# Pull one fresh batch from each pipeline and flatten it for SHAP.
train_data = sess.run(train_next_batch)
test_data = sess.run(test_next_batch)

train_data_batch = preprocessor(train_data)
test_data_batch = preprocessor(test_data)

# NOTE(review): train_data/test_data are rebound here from batch dicts to
# DataFrames — earlier cells that used the dict form will break on re-run.
train_data = data_to_df(train_data_batch)
test_data = data_to_df(test_data_batch)
  • First summarize the training data into n clusters => for fast processing
  • This is an optional but helpful step, because the time to generate Shapley values grows rapidly with the size of the background dataset
In [ ]:
data_summary = shap.kmeans(train_data, 25)
  • KernelExplainer is model-agnostic, as it takes the model predictions and training data as input.
  • Instantiate an explainer with the model predictions and training data summary:
In [30]:
explainer = shap.KernelExplainer(dnnobj.predict, data_summary, link="logit")
In [31]:
data = test_data
# Extract Shapley values from the explainer.
# BUG FIX: in the original this call had been merged into the comment line,
# so `shap_values` (used by every force/summary/dependence plot below) was
# never defined.
shap_values = explainer.shap_values(data.iloc[0:input_pipeline_hps.batch_size, :], nsamples=input_pipeline_hps.batch_size)
  • plot the SHAP values of any ith observation
  • below in this particular example, the region_code contributed most to the CTR prediction of 0.82, pushing its value higher
  • while minute was the second most important feature, pushing the prediction value lower
In [58]:
ith_obs = 249  # index of the single observation to explain

# Force plot for one observation: which features pushed this prediction
# above/below the base value (link="logit" maps log-odds to probabilities).
shap.force_plot(explainer.expected_value[0], shap_values[0][ith_obs,:],data.iloc[ith_obs,:],link="logit")
Out[58]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
  • dynamically plot the SHAP values of all instances
In [34]:
# Interactive force plot over the whole explained batch (one column per row).
NUM_ROWS = input_pipeline_hps.batch_size
shap.force_plot(explainer.expected_value, shap_values[0], data.iloc[0:NUM_ROWS,:], link="logit")
Out[34]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
  • Below, the summary plot displays a distribution of Shapley values for each feature.
  • The color of each point is on a spectrum where highest values for that feature are red, and lowest values are blue.
  • The features are ranked by the sum of the absolute values of their Shapley values across all samples
  • The first three features with the highest contribution are 'publisher','region_code', and 'wh'
In [35]:
# A way to see the relative impact of all features over the entire dataset.
shap.summary_plot(shap_values[0], data)
  • plotting the dependence plot shows the correlation of a feature to its shapley value
  • it can also show the interaction of two features
  • below, viewability is positively correlated with its shapley values
  • its interaction with bidfloor is not conclusive
In [61]:
shap.dependence_plot("viewability", shap_values[0], data,interaction_index="bidfloor")
In [63]:
shap.dependence_plot("wh", shap_values[0], data,interaction_index="minute")
In [62]:
shap.dependence_plot("click_through_rate", shap_values[0], data,interaction_index="bidfloor")
In [65]:
shap.dependence_plot("minute", shap_values[0], data,interaction_index="viewability")
In [42]:
shap.dependence_plot("bidfloor", shap_values[0], data,interaction_index="minute")
In [68]:
shap.dependence_plot("publisher", shap_values[0], test_data,interaction_index=None)
In [69]:
shap.dependence_plot("minute", shap_values[0], test_data,interaction_index=None)
  • barplot summarizing the Shapley values
In [70]:
shap.summary_plot(shap_values[0], data, plot_type="bar")